pca <- prcomp(emb)
plot(pca,type="l",col="red")
pca.sum <- summary(pca)
plot(pca.sum$importance["Cumulative Proportion",],type="l")
abline(v=50,col="red")
abline(h=0.5,col="blue")
pc <- as.data.frame(pca$x)
pcn <- normalize(pc)
w1 <- "rey"
w2 <- "reina"
w3 <- "hombre"
similares(w1,x=emb)
## word freq cos.sim
## 6051 reyes 1 0.7648581
## 3340 reino 1 0.7598407
## 3829 pr<U+00ED>ncipe 1 0.7326977
## 3279 reina 1 0.7253803
## 424843 Harthacnut 1 0.7045774
## 505451 Ragnald 1 0.7024436
## 445990 Sverkersson 1 0.7003423
## 15957 regente 1 0.6984726
## 264595 Hardeknut 1 0.6976819
## 305960 Ladul<U+00E1>s 1 0.6969224
similares(w1,x=pc)
## word freq cos.sim
## 6051 reyes 1 0.7326543
## 3340 reino 1 0.7277504
## 3829 pr<U+00ED>ncipe 1 0.6925480
## 3279 reina 1 0.6853842
## 5012 trono 1 0.6601736
## 15957 regente 1 0.6569846
## 30259 vasallo 1 0.6322269
## 264595 Hardeknut 1 0.6300042
## 28527 pretendiente 1 0.6288754
## 21283 consorte 1 0.6282198
similares(w1,x=pcn)
## word freq cos.sim
## 6051 reyes 1 0.7326543
## 3340 reino 1 0.7277504
## 3829 pr<U+00ED>ncipe 1 0.6925480
## 3279 reina 1 0.6853842
## 5012 trono 1 0.6601736
## 15957 regente 1 0.6569846
## 30259 vasallo 1 0.6322269
## 264595 Hardeknut 1 0.6300042
## 28527 pretendiente 1 0.6288754
## 21283 consorte 1 0.6282198
analogia(w1,w2,w3,x = emb)
## word freq cos.sim
## 788 joven 1 0.6321573
## 63796 f<U+00E9>mina 1 0.6205990
## 40325 jovencita 1 0.6140978
## 14136 muchacha 1 0.5765563
## 432 persona 1 0.5680321
## 3064 ni<U+00F1>a 1 0.5657318
## 606320 trasvesti 1 0.5631110
## 301462 travest<U+00ED> 1 0.5522197
## 18179 anciana 1 0.5488744
## 500250 Acuchilla 1 0.5468871
analogia(w1,w2,w3,x = pc)
## word freq cos.sim
## 63796 f<U+00E9>mina 1 0.5744859
## 788 joven 1 0.5691486
## 40325 jovencita 1 0.5578101
## 14136 muchacha 1 0.5080257
## 432 persona 1 0.5059552
## 3064 ni<U+00F1>a 1 0.5049733
## 18179 anciana 1 0.4807809
## 6425 adolescente 1 0.4715252
## 198865 sexagenaria 1 0.4687589
## 5450 chica 1 0.4678362
analogia(w1,w2,w3,x = pcn)
## word freq cos.sim
## 63796 f<U+00E9>mina 1 0.5749063
## 788 joven 1 0.5728569
## 40325 jovencita 1 0.5576341
## 432 persona 1 0.5090779
## 14136 muchacha 1 0.5080447
## 3064 ni<U+00F1>a 1 0.5045333
## 18179 anciana 1 0.4819011
## 6425 adolescente 1 0.4735706
## 198865 sexagenaria 1 0.4695254
## 5450 chica 1 0.4678327
hist(pcn$PC1)
vpc1 <- vocab[ order(pcn$PC1,decreasing = T), ]
head(vpc1,n=10)
## word freq
## 800453 <U+AD6D><U+C81C><U+C120> 1
## 819987 <U+00C1>ltal<U+00E1>nos 1
## 837692 Inspektor 1
## 913756 Ogrodniczego 1
## 803819 Seks 1
## 629651 Gospodarstwo 1
## 829811 Lundsgaarder 1
## 842692 <U+B77C><U+C6B4><U+C9C0> 1
## 994759 LEGACY 1
## 922682 actie 1
tail(vpc1,n=10)
## word freq
## 536001 venci<U+00E9>ndose 1
## 377783 estrecheza 1
## 440762 agazapan 1
## 572525 Wampagkit 1
## 324697 ocultar<U+00E1>n 1
## 481823 desembarazados 1
## 340768 import<U+00E1>ndoles 1
## 606030 cre<U+00E1>ndoles 1
## 668453 Bucellarii 1
## 428031 Esparavel 1
word.plot.dimension(pcn,target.dim = 1,sec.dim = 2,v = vocab,max.plot = 25)
last <- pcn[,ncol(pcn)]
hist(last)
vpc1 <- vocab[ order(last,decreasing = T), ]
head(vpc1,n=10)
## word freq
## 31 ha 1
## 79 hab<U+00ED>a 1
## 379 hab<U+00ED>an 1
## 41 han 1
## 275 haber 1
## 1406 habr<U+00ED>a 1
## 1002 hayan 1
## 1291 Ha 1
## 5208 habr<U+00ED>an 1
## 5002 hubiese 1
tail(vpc1,n=10)
## word freq
## 74483 devenido 1
## 11537 optado 1
## 4086 cambiado 1
## 4084 vuelto 1
## 167 estado 1
## 84 sido 1
## 2514 decidido 1
## 55737 perdurado 1
## 17010 sobrevivido 1
## 1612 llegado 1
word.plot.dimension(pcn,target.dim = ncol(pcn),sec.dim = 1,v = vocab,max.plot = 25)
Interesante: se encuentran numeros en los valores bajos de PC2 (jobwords)
word.plot.dimension(pcn,target.dim = "PC2",sec.dim = "PC1",max.plot = 10)
w <- "rey"
similares(w, x = pcn)
## word freq cos.sim
## 6051 reyes 1 0.7326543
## 3340 reino 1 0.7277504
## 3829 pr<U+00ED>ncipe 1 0.6925480
## 3279 reina 1 0.6853842
## 5012 trono 1 0.6601736
## 15957 regente 1 0.6569846
## 30259 vasallo 1 0.6322269
## 264595 Hardeknut 1 0.6300042
## 28527 pretendiente 1 0.6288754
## 21283 consorte 1 0.6282198
d <- similares(w,n = 100, x = pcn,vectors = T)
dv <- vocab[rownames(d),]
word.plot(d,dims = c(1,2),v=dv)
#pca en ese conjunto
d.pca <- prcomp(d)
summary(d.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 0.2690 0.20987 0.16515 0.14499 0.12936 0.12372
## Proportion of Variance 0.1632 0.09936 0.06153 0.04742 0.03775 0.03453
## Cumulative Proportion 0.1632 0.26258 0.32411 0.37153 0.40928 0.44381
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.11562 0.10860 0.10289 0.09674 0.09373 0.08888
## Proportion of Variance 0.03015 0.02661 0.02388 0.02111 0.01982 0.01782
## Cumulative Proportion 0.47396 0.50056 0.52444 0.54555 0.56537 0.58319
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.08691 0.08467 0.07807 0.07721 0.07571 0.07475
## Proportion of Variance 0.01704 0.01617 0.01375 0.01345 0.01293 0.01260
## Cumulative Proportion 0.60023 0.61640 0.63015 0.64360 0.65653 0.66914
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.07380 0.07147 0.06978 0.06821 0.06696 0.06586
## Proportion of Variance 0.01229 0.01152 0.01099 0.01049 0.01012 0.00979
## Cumulative Proportion 0.68142 0.69295 0.70393 0.71443 0.72454 0.73433
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.06451 0.06336 0.06241 0.06027 0.05948 0.05818
## Proportion of Variance 0.00939 0.00906 0.00879 0.00819 0.00798 0.00763
## Cumulative Proportion 0.74371 0.75277 0.76156 0.76975 0.77773 0.78536
## PC31 PC32 PC33 PC34 PC35 PC36
## Standard deviation 0.05788 0.05646 0.05600 0.05505 0.05389 0.05324
## Proportion of Variance 0.00756 0.00719 0.00707 0.00684 0.00655 0.00639
## Cumulative Proportion 0.79292 0.80011 0.80719 0.81402 0.82057 0.82697
## PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 0.05227 0.05175 0.05067 0.04955 0.04903 0.04783
## Proportion of Variance 0.00616 0.00604 0.00579 0.00554 0.00542 0.00516
## Cumulative Proportion 0.83313 0.83917 0.84496 0.85050 0.85592 0.86108
## PC43 PC44 PC45 PC46 PC47 PC48
## Standard deviation 0.04718 0.04705 0.04654 0.04563 0.04510 0.04447
## Proportion of Variance 0.00502 0.00499 0.00489 0.00470 0.00459 0.00446
## Cumulative Proportion 0.86610 0.87110 0.87598 0.88068 0.88527 0.88973
## PC49 PC50 PC51 PC52 PC53 PC54
## Standard deviation 0.04345 0.04308 0.04243 0.04176 0.04141 0.04057
## Proportion of Variance 0.00426 0.00419 0.00406 0.00393 0.00387 0.00371
## Cumulative Proportion 0.89399 0.89817 0.90223 0.90617 0.91004 0.91375
## PC55 PC56 PC57 PC58 PC59 PC60
## Standard deviation 0.03994 0.03934 0.03894 0.03812 0.03758 0.03659
## Proportion of Variance 0.00360 0.00349 0.00342 0.00328 0.00319 0.00302
## Cumulative Proportion 0.91735 0.92084 0.92426 0.92753 0.93072 0.93374
## PC61 PC62 PC63 PC64 PC65 PC66
## Standard deviation 0.03583 0.03566 0.03499 0.03476 0.03395 0.03315
## Proportion of Variance 0.00290 0.00287 0.00276 0.00272 0.00260 0.00248
## Cumulative Proportion 0.93664 0.93950 0.94227 0.94499 0.94759 0.95007
## PC67 PC68 PC69 PC70 PC71 PC72
## Standard deviation 0.03245 0.03220 0.03176 0.03140 0.03087 0.03063
## Proportion of Variance 0.00238 0.00234 0.00228 0.00222 0.00215 0.00212
## Cumulative Proportion 0.95245 0.95479 0.95706 0.95929 0.96144 0.96355
## PC73 PC74 PC75 PC76 PC77 PC78
## Standard deviation 0.03018 0.02943 0.02927 0.02898 0.02870 0.02804
## Proportion of Variance 0.00206 0.00195 0.00193 0.00189 0.00186 0.00177
## Cumulative Proportion 0.96561 0.96756 0.96949 0.97139 0.97324 0.97502
## PC79 PC80 PC81 PC82 PC83 PC84
## Standard deviation 0.02791 0.02694 0.02688 0.02634 0.02601 0.02535
## Proportion of Variance 0.00176 0.00164 0.00163 0.00156 0.00153 0.00145
## Cumulative Proportion 0.97678 0.97841 0.98004 0.98161 0.98313 0.98458
## PC85 PC86 PC87 PC88 PC89 PC90
## Standard deviation 0.02484 0.02440 0.02388 0.02324 0.02288 0.02248
## Proportion of Variance 0.00139 0.00134 0.00129 0.00122 0.00118 0.00114
## Cumulative Proportion 0.98598 0.98732 0.98860 0.98982 0.99100 0.99214
## PC91 PC92 PC93 PC94 PC95 PC96
## Standard deviation 0.02201 0.02128 0.02058 0.02046 0.01994 0.01902
## Proportion of Variance 0.00109 0.00102 0.00096 0.00094 0.00090 0.00082
## Cumulative Proportion 0.99324 0.99426 0.99521 0.99616 0.99705 0.99787
## PC97 PC98 PC99 PC100
## Standard deviation 0.01846 0.01756 0.01720 1.195e-16
## Proportion of Variance 0.00077 0.00070 0.00067 0.000e+00
## Cumulative Proportion 0.99864 0.99933 1.00000 1.000e+00
plot(d.pca$sdev,type="o",pch="+")
text(x=seq(1,100),y=d.pca$sdev,labels = paste0("PC",seq(1,100)),cex=0.5,pos=4)
Con dos primeras dimensiones palabras y numeros son separables linealmente
word.plot(d.pca$x,v=dv)
Con PC1 y PC3 (casi solo PC3) las cantidades sin “$” son separables
word.plot(d.pca$x,v=dv,dims = c("PC1","PC3"))
Con PC4 cantidades “pequeñas” son sepabales
word.plot(d.pca$x,v=dv,dims = c("PC1","PC4"))
Con PC5 no hay nada evidente, con PC6 hay algunos errores de limpieza de datos
word.plot(d.pca$x,v=dv,dims = c("PC1","PC6"))
Un k-means muy interesante:
pca.cluster.similar(w,x = pcn,v = vocab,n=200,show.center.analysis = T)
## NULL
dclust<-pca.cluster.similar(w,x = pcn,v = vocab,n = 500,centers = 11)
cluster.wordcloud(dclust,scale = c(3,0.5),by.sim=T)
## Warning in wordcloud(words = dclust.sub$word, freq = f, colors = colors, :
## Ramathibodi could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = dclust.sub$word, freq = f, colors = colors, :
## Tanutamani could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = dclust.sub$word, freq = f, colors = colors, :
## Erishum could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = dclust.sub$word, freq = f, colors = colors, :
## Fil<U+00F3>pator could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = dclust.sub$word, freq = f, colors = colors, :
## Evergetes could not be fit on page. It will not be plotted.
Comparar con el kmeans usando el espacio original:
d <- similares(w,n = 100, x = emb,vectors = T)
dv <- vocab[rownames(d),]
k <- kmeans(d[,1:8],6)
word.plot(d[,1:2],v=dv,col=k$cluster)
#head(emb,n = 10)
#similares("france",x=emb)
#Algunas estadisticas interesantes
#summary(emb$V200)
#qqplot(emb$V200,rnorm(1000,mean=mean(emb$V200),sd=sd(emb$V200)),pch=".")
#abline(a=0,b=1,col="red")
#norms<-apply(emb,1,function(x){sqrt(sum(x^2))})
#hist(norms)
#similitud promedio de los 10 primeros resultados para una muestra aleatoria de 100 palabras
#s<-sapply(vocab$V2[sample(nrow(vocab),100)],FUN=function(x){mean(similares(x)$cos.sim)})
#hist(s)